import random, math, datetime
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from datetime import datetime
from sklearn import preprocessing
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from fbprophet import Prophet
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from datetime import datetime as dt, timedelta
# Show full (untruncated) cell contents when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
The stock market is a place where investors, both individual and institutional, come together to buy and sell stocks. The stock market is often described as a highly volatile and random place that is highly sensitive to various catalysts, but there is a method to the madness: investors who hope to profit from the stock market often employ various stock analysis methods to trade. These can be broadly classified into two categories, fundamental and technical analysis. Fundamental analysis, as the name suggests, focuses on the fundamental values of the company, including factors like revenue, earnings, net profit, price-to-earnings ratio and earnings per share, to determine the future trajectory of a company's share price. This method requires deep knowledge and a holistic view of all the factors affecting the company. If executed correctly, it can lead to discovering profitable trades before everyone else, potentially leading to larger profits, but it is prone to suffer from the randomness in the market, as well as biases often seen in human investing behavior and illogical decision making.
Conversely, technical analysis mainly studies the supply and demand of a stock within the market. Investors who use technical analysis believe that a stock’s historical performance indicates how the stock will perform in the future. Little attention is given to the fundamental values of the company. Technical analysis places heavy focus on the study of trends, charts and patterns of the stock price. Most people believe that fundamental analysis is a good method only on a long-term basis. However, for short-term and medium-term speculations, fundamental analysis is generally not suitable.
Our goal is to analyze historical stock price data for a set of 51 companies across multiple sectors and market-cap sizes. We first perform exploratory data analysis (EDA) including descriptive statistics to make sense and study the behavior of the data. We then leverage traditional probabilistic machine learning methods to forecast stock prices for a company. We also look at clustering these companies based on their trading patterns.
We obtain the historical stock data from Yahoo Finance using an open source Python API, yfinance [1]. It takes a collection of stock ticker symbols and returns the data in the form of a pandas DataFrame object. The collection of companies and their ticker symbols is stored in a Python dictionary. The dataset contains the opening price, high, low and closing price of the stocks for each company, along with the volume. The volume is the number of shares that are traded on a particular day. The rows make up each day's values, while the columns in the dataframe contain the above-mentioned prices.
Here is a sample of the dataset, we display the stock price of Apple Inc. (Ticker: AAPL)
# Download two months of AAPL daily OHLCV data and preview the first rows.
yf.download(['AAPL'], start="2020-12-31", end="2021-03-01").head()
# Company name -> ticker symbol for the 51 companies analyzed.
# Fixed: 'McDonalds' appeared twice ('MCD'); duplicate dict keys are
# silently collapsed, so the second entry was removed.
companies_dict = {
    'Amazon': 'AMZN',
    'Apple': 'AAPL',
    'Walgreen': 'WBA',
    'Northrop Grumman': 'NOC',
    'Boeing': 'BA',
    'Lockheed Martin': 'LMT',
    'McDonalds': 'MCD',
    'Intel': 'INTC',
    'Navistar': 'NAV',
    'IBM': 'IBM',
    'Texas Instruments': 'TXN',
    'MasterCard': 'MA',
    'Microsoft': 'MSFT',
    'General Electric': 'GE',
    'American Express': 'AXP',
    'Pepsi': 'PEP',
    'Coca Cola': 'KO',
    'Johnson & Johnson': 'JNJ',
    'Toyota': 'TM',
    'Honda': 'HMC',
    'Tesla': 'TSLA',
    'Sony': 'SNE',
    'Exxon': 'XOM',
    'Chevron': 'CVX',
    'Valero Energy': 'VLO',
    'Ford': 'F',
    'Bank of America': 'BAC',
    'Nio': 'NIO',
    'Pioneer Natural Resources': 'PXD',
    'Volkswagen': 'VWAGY',
    'Panasonic': 'PCRFY',
    'JP Morgan Chase': 'JPM',
    'Alphabet': 'GOOGL',
    'Facebook': 'FB',
    'Nvidia': 'NVDA',
    'Netflix': 'NFLX',
    'Disney': 'DIS',
    'Alibaba': 'BABA',
    'Visa': 'V',
    'Walmart': 'WMT',
    'Target': 'TGT',
    'Home Depot': 'HD',
    'Dominos': 'DPZ',
    'Chipotle': 'CMG',
    'Procter & Gamble': 'PG',
    'Moderna': 'MRNA',
    'Novartis': 'NVS',
    'Merck': 'MRK',
    'AT&T': 'T',
    'Verizon': 'VZ',
    '3M': 'MMM',
}
# Pull one year (calendar 2020) of daily OHLCV data for every ticker.
data = yf.download(list(companies_dict.values()), start="2020-01-01", end="2021-01-01")
Checking for any null values in the dataframe
# Sanity check: prints True if any cell in the downloaded frame is NaN.
print(data.isnull().values.any())
Visualizing stock prices as a graph. We can select any tickers on the right to compare prices and trajectories of specific companies.
def plot_all_stocks(data):
    """Plot the adjusted closing price of every ticker on a log-scale y axis."""
    figure = go.Figure()
    adj_close = data['Adj Close']
    # one line trace per ticker column
    for ticker in adj_close.columns:
        figure.add_scatter(x=data.index, y=adj_close[ticker], name=ticker)
    # log scale so small- and large-cap prices are comparable
    start_label = data.index[0].strftime('%B %Y')
    end_label = data.index[-1].strftime('%B %Y')
    figure.update_layout(
        yaxis=dict(type="log"),
        title=f"Stock prices for {start_label} - " + f"{end_label}",
    )
    return figure
# Render the interactive log-scale price chart for all downloaded tickers.
plot_all_stocks(data).show()
Cluster analysis, in general terms, is the task of grouping a set of objects in such a way that objects in the same group (called a cluster) are more similar (in some sense) to each other than to those in other groups (clusters). It is a main task of exploratory data analysis, and a common technique for statistical data analysis [2].
Here we try to cluster the companies based on their trading pattern, we use the daily return along with the daily volume for the feature vector. We first analyze the data to be used as feature vector for the clustering algorithm.
The daily return is measured as $\text{Return} = \text{Closing price} - \text{Opening price}$. Let's take a look at two companies and their returns.
def plot_open_close(data, tickers: list, no_prev_days=30):
    """Plot opening vs. closing prices over the last `no_prev_days` trading
    days, one subplot per ticker.

    Fixes: the window was hard-coded to 30 rows (the `no_prev_days`
    parameter was ignored), and the x axis used the full date index while
    the y values were sliced to the window.
    """
    window = data.iloc[-no_prev_days:]
    subps = make_subplots(rows=1, cols=len(tickers))
    for i, ticker in enumerate(tickers):
        subps.add_trace(go.Scatter(x=window.index, y=window['Open'][ticker], name=f'{ticker} Open'), row=1, col=i+1)
        subps.add_trace(go.Scatter(x=window.index, y=window['Close'][ticker], name=f'{ticker} Close'), row=1, col=i+1)
        subps.update_yaxes(title_text=f"stock price: {ticker}", row=1, col=i+1)
    subps.update_layout(
        title=f"Stock prices for last {no_prev_days} days")
    return subps
def plot_price_difference(data, tickers: list, no_prev_days=30):
    """Plot the daily return (close - open) over the last `no_prev_days`
    trading days, one subplot per ticker.

    Fixes: the window was hard-coded to 30 rows (ignoring `no_prev_days`),
    the x axis used the full index against sliced y values, and the trace
    was mislabeled '... Open' although it shows the price difference.
    """
    window = data.iloc[-no_prev_days:]
    subps = make_subplots(rows=1, cols=len(tickers))
    for i, ticker in enumerate(tickers):
        subps.add_trace(go.Scatter(x=window.index, y=window['Close'][ticker] - window['Open'][ticker], name=f'{ticker} Return'), row=1, col=i+1)
        subps.update_yaxes(title_text=f"daily stock price change: {ticker}", row=1, col=i+1)
    subps.update_layout(
        title=f"Stock price change for last {no_prev_days} days")
    return subps
# Compare a mega-cap (Amazon) against an energy major (Chevron):
# raw open/close trajectories, then daily return differences.
plot_open_close(data, ['AMZN', 'CVX']).show()
plot_price_difference(data, ['AMZN', 'CVX']).show()
If the closing price is higher than the opening price, that day's return is positive, meaning that if you had invested in the company at the open, you would have profited from the investment. Instead, if the stock price fell, the daily return would be negative and you would incur a loss on that day.
One immediate thing we notice is that the scale of the returns varies among different companies. For example, Amazon losing $50 in a day isn't much, but if Chevron lost that much, the company would be bankrupt. This motivates us to either scale these values to a constant range or calculate the percentage change in the price. We use min-max scaling on the daily returns to normalize the data.
# Column-wise min-max scaling: every series (prices and volumes) is mapped to [0, 1].
normalized_data = (data-data.min())/(data.max()-data.min())
We then extract stock-wise opening and closing prices to calculate the normalized return
def get_daily_return(data):
    """Return close-minus-open per stock, transposed to (n_stocks, n_days)."""
    opens = np.array(data['Open']).T
    closes = np.array(data['Close']).T
    return closes - opens
# Daily returns on the min-max scaled data; re-plot to confirm comparable scales.
daily_return = get_daily_return(normalized_data)
plot_price_difference(normalized_data, ['AMZN', 'CVX']).show()
Similarly we take a look at volumes and also note that they need normalizing too
def plot_volumes(data, tickers: list, no_prev_days=30):
    """Plot daily traded volume over the last `no_prev_days` trading days,
    one subplot per ticker.

    Fixes: the window was hard-coded to 30 rows (ignoring `no_prev_days`)
    and the x axis used the full index against sliced y values.
    """
    window = data.iloc[-no_prev_days:]
    subps = make_subplots(rows=1, cols=len(tickers))
    for i, ticker in enumerate(tickers):
        subps.add_trace(go.Scatter(x=window.index, y=window['Volume'][ticker], name=f'{ticker} Volume'), row=1, col=i+1)
        subps.update_yaxes(title_text=f"daily volume: {ticker}", row=1, col=i+1)
    subps.update_layout(
        title=f"Stock trading volume for last {no_prev_days} days")
    return subps
# Volumes before and after min-max normalization for the same two tickers.
plot_volumes(data, ['AMZN', 'CVX']).show()
plot_volumes(normalized_data, ['AMZN', 'CVX']).show()
We now create our feature vector combining both daily returns and daily volumes, we explore two ways of creating it.
Since the vectors are in higher dimension i.e (total number of days we are analyzing), we perform dimensionality reduction to project into 2D space for visualization. For this purpose, we use PCA. We also cluster the data in the lower dimension too and we intend to pick the method that results in better clusters.
def get_daily_volume(data):
    """Return daily traded volume per stock, transposed to (n_stocks, n_days)."""
    volumes = data['Volume']
    return np.array(volumes).T
def get_stock_feature_vector(daily_return, daily_volume, method: str):
    """Combine daily returns and volumes into one feature matrix.

    method='product'     -> element-wise product, shape (n_stocks, n_days)
    method='concatenate' -> horizontal stack, shape (n_stocks, 2 * n_days)

    Raises ValueError (a subclass of the previously raised bare Exception,
    so existing handlers still match) for any other method, including the
    offending value in the message.
    """
    if method == 'product':
        return np.multiply(daily_return, daily_volume)
    if method == 'concatenate':
        return np.hstack((daily_return, daily_volume))
    raise ValueError(f"Not a valid method: {method!r}")
def reduce_data(data, method='tsne'):
    """Project `data` to 2-D and return [x_coords, y_coords].

    method='pca' uses PCA; anything else falls back to t-SNE (perplexity 2).
    """
    if method == 'pca':
        projected = PCA(n_components=2).fit_transform(data)
    else:
        projected = TSNE(n_components=2, perplexity=2).fit_transform(data)
    return [projected[:, 0], projected[:, 1]]
def plot_reduced_data(reduced_datas: list):
    """Scatter-plot each 2-D projection side by side, one subplot per projection."""
    subps = make_subplots(rows=1, cols=len(reduced_datas))
    for idx, (xs, ys) in enumerate(reduced_datas, start=1):
        trace = go.Scatter(x=xs, y=ys, mode='markers', name=f'feature vector: {idx}')
        subps.add_trace(trace, row=1, col=idx)
    subps.update_layout(title="Feature vector projected to 2D")
    return subps
# Build both candidate feature vectors (element-wise product vs. concatenation)
# and project each to 2-D with PCA for visual comparison.
daily_volume = get_daily_volume(normalized_data)
x1 = get_stock_feature_vector(daily_return, daily_volume, 'product')
x2 = get_stock_feature_vector(daily_return, daily_volume, 'concatenate')
reduced_data_pca_x1 = reduce_data(x1, 'pca')
reduced_data_pca_x2 = reduce_data(x2, 'pca')
plot_reduced_data([reduced_data_pca_x1, reduced_data_pca_x2]).show()
There are many approaches to clustering but we consider the following widely used clustering techniques;
The KMeans algorithm clusters data by trying to separate samples in $k$ groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares [3]. This algorithm requires the number of clusters to be specified. Since it is very simple to calculate, it scales well to large number of samples and has been used across a large range of application areas in many different fields.
The k-means algorithm divides a set of $n$ samples in dataset $X$ into $k$ disjoint clusters $C$, each described by the mean $\mu_i$ of the samples in the cluster, these means are commonly called the cluster centroids;
The K-means algorithm aims to choose centroids that minimise the within-cluster sum-of-squares objective as follows: $$\sum_{i=0}^{n}\min_{\mu_j \in C}(||X_i - \mu_j||^2)$$ where $X_i$ is a particular row from the dataset $X$ or dimensions $n * f$ where $f$ is the number of features.
A Gaussian mixture model is a probabilistic model that assumes all the data points are generated from a mixture of a finite number of Gaussian distributions with unknown parameters. One can think of mixture models as generalizing k-means clustering to incorporate information about the covariance structure of the data as well as the centers of the latent Gaussians [4].
For a dataset $X$ with $f$ features, the Gaussian mixture model or GMM for short determines a mixture of $k$ Gaussian distributions (where $k$ is equivalent to the number of clusters), each having a certain mean and variance. GMM uses Expectation-Maximization to estimate these unknown means and variances for each of the gaussian distributions from the available data.
In brief, the EM algorithm attempts to find maximum likelihood estimates for models with latent variables. It works by first assuming random components (randomly centered on data points, or often initialized from k-means for better performance), computing for each point a probability of being generated by each component of the model, and then tweaking the parameters to maximize the likelihood of the data given those assignments. Repeating this process is guaranteed to converge to a local optimum.
If our observations $X_i$ come from a mixture model with $K$ mixture components, the marginal probability distribution of $X_i$ is of the form: $$P(X_i = x) = \sum_{k=1}^K \pi_kP(X_i=x|Z_i=k)$$ where $Z_i\in\{1,\dots,K\}$ is the latent variable representing the mixture component for $X_i$, $P(X_i|Z_i)$ is the mixture component, and $\pi_k$ is the mixture proportion representing the probability that $X_i$ belongs to the $k$-th mixture component.
Let $N(\mu, \sigma^2)$ denote the probability distribution function for a normal random variable. In this scenario, we have that the conditional distribution $X_i|Z_i = k \sim N(\mu_k, \sigma_k^2)$ so that the marginal distribution of $X_i$ is: $$P(X_i = x) = \sum_{k=1}^K P(Z_i = k) P(X_i=x | Z_i = k) = \sum_{k=1}^K \pi_k N(x; \mu_k, \sigma_k^2)$$
the joint probability of observations $X_1,\ldots,X_n$ is therefore: $$P(X_1=x_1,\ldots,X_n=x_n) = \prod_{i=1}^n \sum_{k=1}^K \pi_k N(x_i; \mu_k, \sigma_k^2)$$
EM algorithm obtains the maximum likelihood estimates of $\pi_k, \mu_k$ and $\sigma_k^2$ given a data set of observations $\{x_1,\ldots, x_n\}$ [5].
In the E-step, we use the current value of the parameters $\theta_0$ to find the posterior distribution of the latent variables given by $P(Z|X,\theta_0)$. We then use this to find the expectation: $$E_{Z|X,\theta_0}\left [\log (P(X,Z|\theta)) \right] =\sum_Z P(Z|X,\theta_0) \log (P(X,Z|\theta))$$
In M-step, we determine the new parameter $\hat{\theta}$ by maximizing the expectation obtained in the above step. $$\hat{\theta} = \text{argmax}_{\theta} E_{Z|X,\theta_0}\left [\log (P(X,Z|\theta)) \right]$$
We empirically determined that $k=8$ i.e. we use k-means and GMM to cluster the data into 8 clusters.
class Cluster:
    """Fit a clustering algorithm on stock feature vectors and expose the
    labels, a 2-D scatter plot, and a per-cluster company listing.

    Parameters
    ----------
    data : array-like, one row per company (the clustering features).
    companies_dict : dict mapping company name -> ticker symbol; its order
        must match the rows of `data`.
    cluster_method : one of 'spectral', 'bayesian_gaussian_mixture',
        'gaussian_mixture', 'k_means', 'agglomerative'.
    n_clusters : number of clusters / mixture components.
    reduced_data : [x, y] 2-D projection used only for plotting.
    """

    def __init__(self, data, companies_dict, cluster_method, n_clusters, reduced_data):
        self.data = data
        self.companies_dict = companies_dict
        self.cluster_method = cluster_method
        self.n_clusters = n_clusters
        self.reduced_data = reduced_data
        # fit, plot, and summarize eagerly so results are ready on access
        self.labels_df = self.get_clusters()
        self.fig = self.plot_clusters()
        self.cluster_df = self.get_cluster_data()

    def get_clusters(self):
        """Fit the selected estimator; return a DataFrame with one row per
        company holding its cluster label, name, and ticker symbol.

        Raises KeyError for an unknown `cluster_method` (as before).
        """
        # Factories so only the requested estimator is instantiated;
        # the original built all five objects on every call.
        estimator_factories = {
            'spectral': lambda: SpectralClustering(n_clusters=self.n_clusters, n_jobs=-1),
            'bayesian_gaussian_mixture': lambda: BayesianGaussianMixture(n_components=self.n_clusters, max_iter=1000, n_init=10),
            'gaussian_mixture': lambda: GaussianMixture(n_components=self.n_clusters, max_iter=1000, n_init=10),
            'k_means': lambda: KMeans(n_clusters=self.n_clusters, max_iter=1000),
            'agglomerative': lambda: AgglomerativeClustering(self.n_clusters),
        }
        estimator = estimator_factories[self.cluster_method]()
        return pd.DataFrame({
            'labels': estimator.fit_predict(self.data),
            'companies': list(self.companies_dict.keys()),
            'ticker_symbol': list(self.companies_dict.values()),
        })

    def plot_clusters(self):
        """Scatter-plot companies in the reduced 2-D space, colored by cluster.

        NOTE: mutates `labels_df` in place (adds x/y/cluster columns),
        matching the original behavior.
        """
        self.labels_df['x'] = self.reduced_data[0]
        self.labels_df['y'] = self.reduced_data[1]
        self.labels_df['cluster'] = self.labels_df['labels'].astype(str)
        return px.scatter(self.labels_df, x='x', y='y', color='cluster',
                          hover_name=list(self.labels_df['companies']),
                          width=800, height=400)

    def get_cluster_data(self):
        """Return one row per cluster id with comma-joined member names and
        space-joined tickers (empty strings for empty clusters)."""
        members = self.labels_df
        return pd.DataFrame({
            'cluster': list(range(self.n_clusters)),
            'companies': [", ".join(members['companies'].loc[members['labels'] == i].to_list())
                          for i in range(self.n_clusters)],
            'ticker_symbol': [" ".join(members['ticker_symbol'].loc[members['labels'] == i].to_list())
                              for i in range(self.n_clusters)],
        })

    def print_cluster_df(self):
        """Return just the cluster-id and member-company columns for display."""
        return self.cluster_df.iloc[:, 0:2]
# Cluster the PCA-reduced 'product' features with GMM and k-means (k = 8)
# and display both labelled scatter plots.
n_clusters = 8
clusters_gmm_reduced_x1 = Cluster(np.array(reduced_data_pca_x1).T, companies_dict, 'gaussian_mixture', n_clusters, reduced_data_pca_x1)
clusters_km_reduced_x1 = Cluster(np.array(reduced_data_pca_x1).T, companies_dict, 'k_means', n_clusters, reduced_data_pca_x1)
clusters_gmm_reduced_x1.fig.update_layout(title='Gaussian Mixture Model')
clusters_gmm_reduced_x1.fig.show()
clusters_km_reduced_x1.fig.update_layout(title='K-Means')
clusters_km_reduced_x1.fig.show()
Analyzing the plots above we can notice that both techniques performed reasonably well while clustering the data. We can also note the characteristic nature of these methods. Since K-means is a distance-based method, we can observe that the clusters formed are largely round in shape. This is largely because the centroids of the clusters are updated iteratively using the calculated centroid value. In complex data settings, this can be a huge drawback and will likely perform worse, i.e. create less meaningful clusters if the dataset had included thousands of companies from the stock market.
Conversely, since Gaussian Mixture is a probability distribution based clustering technique, it overcomes the above mentioned limitation and creates clusters that are not necessarily circular, we observe this from the plots above as well. In complex data settings, while GMM may perform better than k-means, the computation heavy nature of the EM algorithm prevents it from scaling well for thousands of datapoints.
Below are the average computation times of both these algorithms, we can observe that GMM takes 300% - 500% more time than the k-means algorithm.
# IPython timing magics: average wall-clock fit time for each algorithm.
%timeit Cluster(x1, companies_dict, 'gaussian_mixture', n_clusters, reduced_data_pca_x1)
%timeit Cluster(x1, companies_dict, 'k_means', n_clusters, reduced_data_pca_x1)
Below are the companies that have been clustered together by the gaussian mixture model. The companies share resemblance in various factors, be it the sector, market-cap or price-to-earnings ratio. We believe this is due to the large number of institutional investors trading in these companies together mostly in huge market funds called ETFs which stands for Exchange Traded funds.
# Table of companies grouped by their GMM cluster assignment.
clusters_gmm_reduced_x1.print_cluster_df()
We now try to predict the closing price of a stock based on the historical data using the following two methods.
Feature Engineering
We try to feature engineer two variables, High low percentage, and percentage change, to predict the stock prices [7]
.
Data Preparation Once we have created the two new variables, we do the following steps
Data modeling
Data Prediction
def find_stock_price(ticker_name, start, end):
    """Fit a linear regression on engineered features (adjusted close, daily
    percent change, high-low spread) and forecast the stock price roughly
    10% of the series length into the future.

    Returns (plotly figure of actual vs. forecast values, R^2 score on the
    held-out test split).

    Fixes relative to the original:
    - `X_late` was sliced from X AFTER truncation, so the model "forecast"
      rows that already had labels; it is now taken before truncation.
    - `.copy()` avoids assigning new columns into a view of `data`.
    - deprecated positional `drop(..., 1)` -> `drop(columns=...)`.
    - the test score is computed once and reused.
    """
    # get data
    data = yf.download(ticker_name, start=start, end=end)
    new_data = data[['Adj Close']].copy()
    new_data['poct_change'] = (data['Close'] - data['Open']) / data['Open'] * 100
    new_data['high_low_per'] = (data['High'] - data['Low']) / data['Close'] * 100
    # forecast horizon: ~10% of the series, at least one step
    forecast_out = max(1, round(0.1 * len(new_data)))
    forecast_col = 'Adj Close'
    # label = adjusted close `forecast_out` trading days ahead
    new_data['label'] = new_data[forecast_col].shift(-forecast_out)
    X = np.array(new_data.drop(columns=['label']))
    X = preprocessing.scale(X)
    X_late = X[-forecast_out:]   # rows whose label is unknown -> to forecast
    X = X[:-forecast_out]        # rows with known labels -> train/test
    y = np.array(new_data['label'])[:-forecast_out]
    # split into train/test sets and fit
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70)
    clf = LinearRegression(n_jobs=-1)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    forecast_set = clf.predict(X_late)
    new_data['Forecast'] = np.nan
    # append one row per forecasted day, dated after the last known date
    next_date = new_data.iloc[-1].name + timedelta(days=1)
    for predicted in forecast_set:
        new_data.loc[next_date] = [np.nan for _ in range(len(new_data.columns) - 1)] + [predicted]
        next_date += timedelta(days=1)
    # plot the last 500 rows: actuals plus the appended forecast tail
    tail = new_data.tail(500).reset_index()
    fig = go.Figure()
    fig.add_scatter(x=tail['Date'], y=tail['Adj Close'], name='Actual Values')
    fig.add_scatter(x=tail['Date'], y=tail['Forecast'], mode='lines', name='Forecast')
    return fig, score
# NOTE(review): random.seed only seeds the stdlib RNG; sklearn's
# train_test_split draws from numpy's RNG, so this does not make the
# split reproducible — confirm intent.
random.seed(222)
fig, score = find_stock_price(ticker_name='AAPL', start="2020-03-02", end="2021-02-01")
fig.show()
Time Series Forecasting with Prophet
In predictive analytics, tuning the model's various parameters to suit a particular decision making problem is likely most of the time is spent by practitioners, and it is perhaps the most challenging task. Furthermore, it requires a complete understanding of the data, and its source factors along with the time series models inner workings.
Prophet [6] is a complex regressor based time-series forecasting model with intuitive parameters which are easy to tune, it is distributed as an open-source library in R and Python. It uses a model based on decomposability $(trend+seasonality+holidays)$. It provides us with the ability to make time-series predictions with good accuracy using simple, intuitive parameters. We aim to contrast our simplistic linear regression model's forecast with such an industry-backed (Facebook) model with real world utility. Now we try to forecast the closing prices of a given stock using the Prophet model.
Data: As we have done in linear regression, we forecast the closing prices based on the historical closing price data. The Prophet model takes data in the form of a pandas DataFrame object made up of two columns: the first is $ds$, which contains a datestamp, and the other is $y$, which contains the value we need to forecast. Furthermore, we do not split the data into testing and training sets. Instead, we use all the data to fit the model and then predict the future values, i.e., the stock prices for a given period.
Model: In prophet, we use a decomposable time series model by Harvey & Peters (1990) [9] with three main model components: trend, seasonality, and holidays. They are combined in the following equation: $$y(t) = g(t) + s(t) + h(t) + \epsilon$$ Here $g(t)$ is the trend function which models non-periodic changes in the value of the time series, $s(t)$ represents periodic changes (e.g., weekly and yearly seasonality), and $h(t)$ represents the effects of holidays which occur on potentially irregular schedules over one or more days. The error term $\epsilon$ represents any idiosyncratic changes which are not accommodated by the model; later, we make the parametric assumption that $\epsilon$ is normally distributed. We do not dive deep into the model as it is out of scope for this project.
Once we fit the dataframe to the model, we use predict to forecast. As a result, we are given a data frame with predicted values. Among those values are the boundaries of the uncertainty interval (lower and upper). This is plotted as a price forecast graph, which shows the forecasted prices between the lower and upper uncertainty intervals along with the actual prices. As we can see from our graph, the predicted values do lie with the lower and upper uncertainty intervals.
def forecast_stock_prices(data, ticker_name, n_days):
    """Fit Prophet on log-transformed adjusted closing prices and forecast
    `n_days` into the future.

    Returns
    -------
    forecast : DataFrame with ds, yhat, yhat_lower, yhat_upper (log scale).
    ground_truth : realised Adj Close prices downloaded for the forecast window.
    df : the training frame (ds, y) with y = log price.

    Fix: the ground-truth download window was hard-coded to 60 days and
    ignored `n_days`; it now follows the requested horizon.
    """
    start_day = data.index[-1].strftime('%Y-%m-%d')
    end_day = (data.index[-1] + pd.DateOffset(days=n_days)).strftime('%Y-%m-%d')
    df = data['Adj Close'][[ticker_name]].reset_index().rename(columns={'Date': 'ds', ticker_name: 'y'})
    df['y'] = np.log(df['y'])  # log prices stabilise the variance for the model
    model = Prophet()
    model.fit(df)
    future = model.make_future_dataframe(periods=n_days)
    forecast = model.predict(future)
    forecast = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
    ground_truth = yf.download([ticker_name], start=start_day, end=end_day)[['Adj Close']].reset_index().rename(columns={'Date': 'ds', 'Adj Close': 'y'})
    return forecast, ground_truth, df
def plot_forecast(ticker_name, forecast, X, ground_truth):
    """Overlay the Prophet forecast, its uncertainty band, the (log-scale)
    training data, and the realised prices on one interactive figure.

    Forecast values are exponentiated back from log scale; ground-truth
    prices are already in dollars.
    """
    # shared styling for the upper/lower uncertainty-band traces
    band_style = dict(mode='lines', fill="tonexty", line={"color": "#57b8ff"})
    traces = [
        go.Scatter(x=forecast["ds"], y=np.exp(forecast["yhat"]),
                   mode='lines', name="Forecast"),
        go.Scatter(x=forecast["ds"], y=np.exp(forecast["yhat_upper"]),
                   name="Higher uncertainty interval", **band_style),
        go.Scatter(x=forecast["ds"], y=np.exp(forecast["yhat_lower"]),
                   name="Lower uncertainty interval", **band_style),
        go.Scatter(x=X["ds"], y=np.exp(X["y"]), name="Data values"),
        go.Scatter(x=ground_truth["ds"], y=ground_truth["y"],
                   name="Actual forecast values"),
    ]
    layout = go.Layout(title=f"{ticker_name} Stock Price Forecast",
                       xaxis_rangeslider_visible=True)
    return go.Figure(data=traces, layout=layout)
# Forecast 60 days of AAPL prices with Prophet and plot against actuals.
preds, ground_truth, X = forecast_stock_prices(data, 'AAPL', 60)
plot_forecast('AAPL', preds, X, ground_truth)
We now take a look at the stock price variation of the companies per cluster and notice the similarities in their trajectories
def plot_cluster_stock_price(labels_df):
    """One subplot per cluster (2x4 grid), each showing its members'
    adjusted closing prices on a log scale.

    Uses the module-level `data` frame for prices.
    """
    n_cols = 4
    subps = make_subplots(rows=2, cols=n_cols)
    for ticker, label in zip(labels_df['ticker_symbol'], labels_df['labels']):
        # subplot position from the cluster label; the original's special
        # case for label == n_cols yields the same column value
        row, col = (label // n_cols) + 1, (label % n_cols) + 1
        subps.add_trace(go.Scatter(x=data.index, y=data['Adj Close'][ticker], name=ticker), row=row, col=col)
        subps.update_xaxes(title_text="Cluster: {}".format(label), row=row, col=col)
        subps.update_yaxes(title_text="stock price", type="log", row=row, col=col)
    subps.update_layout(
        title=f"Stock prices for {data.index[0].strftime('%B %Y')} - "
              + f"{data.index[-1].strftime('%B %Y')}")
    return subps
# Per-cluster price trajectories for the GMM clustering.
fig = plot_cluster_stock_price(clusters_gmm_reduced_x1.labels_df)
fig.show()
This plot shows the cumulative returns for all the different stock clusters. A cumulative return on investment can be defined as the aggregate amount that the investment has gained or lost over time, independent of the time involved.[1] . This expected growth is calculated by finding the percentage change of the adjusted stock price for that particular stock. Next, we calculate its cumulative returns using the $cumprod$ function in python. It shows how investing $100 with that particular stock could yield at the end of the period. Additionally, looking at the return of investment at the end of the period can help us evaluate a company's growth potential. If the returns are positive, we could say the company in that period had growth, while conversely, if the returns are negative, we can conclude that the company did not perform well for that period. Below is the plot of Growth on investment for each of the clusters we obtained earlier.
def plot_stock_growth(labels_df):
    """Per-cluster growth of a $100 investment over the analysis period,
    computed as 100 * cumprod(1 + daily pct change) per ticker.

    Uses the module-level `data` frame for prices.

    Fix: `m.iloc[0, :] = 1` used a 2-D indexer on a Series (pct_change of a
    single column), which raises; the first element is now set with iloc[0].
    """
    n_cols = 4
    subps = make_subplots(rows=2, cols=n_cols)
    for ticker, label in zip(labels_df['ticker_symbol'], labels_df['labels']):
        daily_returns = data['Adj Close'][ticker].pct_change()
        growth = (daily_returns + 1).cumprod()
        # first entry is NaN (no prior day); start the series at exactly 1
        growth.iloc[0] = 1
        growth = 100 * growth
        row, col = (label // n_cols) + 1, 1 if label == n_cols else (label % n_cols) + 1
        subps.add_trace(go.Scatter(x=growth.index, y=growth, name=ticker), row=row, col=col)
        subps.update_xaxes(title_text='', row=row, col=col)
        subps.update_yaxes(title_text="Expected Returns", row=row, col=col)
    subps.update_layout(
        title=f"Growth of Investment on $100 during the period of {data.index[0].strftime('%B %Y')} - "
              + f"{data.index[-1].strftime('%B %Y')}")
    return subps
# Per-cluster growth of a $100 investment.
fig = plot_stock_growth(clusters_gmm_reduced_x1.labels_df)
fig.show()
In the above plot, we are trying to plot the chart of risk vs. the return comparisons for different stock clusters. Expected return measures the mean, or expected value, of the probability distribution of investment returns. The expected return of a portfolio or a stock is calculated by multiplying each asset's weight by its expected return and adding each investment's values.[1] For the return rate, use the returns' average, while the risk is calculated through the standard deviation of the returns. Ideally, we would always like to minimize the risk and maximize returns; therefore, if we were to draw an imaginary risk-reward tolerance line across each of the plots, then we would buy all stocks below the imaginary line and sell stocks above this line.
def plot_expected_return(data, cluster_df):
    """Scatter risk (std of daily returns) against expected return (mean of
    daily returns) for each cluster's members, one subplot per cluster."""
    n_cols = 4
    subps = make_subplots(rows=2, cols=n_cols)
    for label, tickers in zip(cluster_df['cluster'], cluster_df['ticker_symbol']):
        member_returns = data['Adj Close'][tickers.split()].pct_change()
        row = (label // n_cols) + 1
        col = 1 if label == n_cols else (label % n_cols) + 1
        marker_trace = go.Scatter(x=member_returns.mean(), y=member_returns.std(),
                                  text=member_returns.columns, mode='markers')
        subps.add_trace(marker_trace, row=row, col=col)
        subps.update_xaxes(range=[0.0, 0.02], title_text="Cluster: {} Expected Returns".format(label), row=row, col=col)
        subps.update_yaxes(range=[0.0, 0.1], title_text="Risk", row=row, col=col)
    subps.update_layout(
        title=f"Expected returns vs risk during the period of {data.index[0].strftime('%B %Y')} - "
              + f"{data.index[-1].strftime('%B %Y')}",
    )
    return subps
# Risk (std of daily returns) vs. expected return (mean) per cluster.
fig = plot_expected_return(data, clusters_gmm_reduced_x1.cluster_df)
fig.show()
We now compare the clustering behavior of both techniques on the higher-dimensional feature vectors with the clusters formed from the lower-dimensional features calculated using PCA.
# Re-cluster on the full-dimensional feature vectors (x1, x2) and on their
# PCA projections, with both GMM and k-means, for the comparison grid below.
n_clusters = 8
clusters_gmm_x1 = Cluster(x1, companies_dict, 'gaussian_mixture', n_clusters, reduced_data_pca_x1)
clusters_gmm_x2 = Cluster(x2, companies_dict, 'gaussian_mixture', n_clusters, reduced_data_pca_x1)
clusters_km_x1 = Cluster(x1, companies_dict, 'k_means', n_clusters, reduced_data_pca_x1)
clusters_km_x2 = Cluster(x2, companies_dict, 'k_means', n_clusters, reduced_data_pca_x1)
clusters_gmm_reduced_x1 = Cluster(np.array(reduced_data_pca_x1).T, companies_dict, 'gaussian_mixture', n_clusters, reduced_data_pca_x1)
clusters_gmm_reduced_x2 = Cluster(np.array(reduced_data_pca_x2).T, companies_dict, 'gaussian_mixture', n_clusters, reduced_data_pca_x1)
clusters_km_reduced_x1 = Cluster(np.array(reduced_data_pca_x1).T, companies_dict, 'k_means', n_clusters, reduced_data_pca_x1)
clusters_km_reduced_x2 = Cluster(np.array(reduced_data_pca_x2).T, companies_dict, 'k_means', n_clusters, reduced_data_pca_x1)
def plot_clusters(cluster_plots: list, n_clusters):
    """Arrange the eight Cluster figures in a 4x2 grid for side-by-side
    comparison of clustering methods and feature vectors.

    Fix: the inner loop hard-coded `range(8)` and ignored the `n_clusters`
    parameter; it now copies one trace per cluster as requested. Trace
    insertion order (all figures' i-th trace, then i+1-th) is preserved.
    """
    subps = make_subplots(rows=4, cols=2, subplot_titles=("GMM x1", "GMM x2", "K-Means x1", "K-Means x2",
                                                          "GMM x1_pca", "GMM x2_pca", "K-Means x1_pca", "K-Means x2_pca"))
    for trace_idx in range(n_clusters):
        for plot_idx, cluster in enumerate(cluster_plots):
            row, col = (plot_idx // 2) + 1, (plot_idx % 2) + 1
            subps.append_trace(cluster.fig['data'][trace_idx], row=row, col=col)
    subps.update_layout({'height': 1600})
    return subps
# 4x2 grid comparing all eight clustering runs (method x feature vector x dimensionality).
plot_clusters([clusters_gmm_x1, clusters_gmm_x2, clusters_km_x1, clusters_km_x2, clusters_gmm_reduced_x1, clusters_gmm_reduced_x2, clusters_km_reduced_x1, clusters_km_reduced_x2], n_clusters)